{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from sklearn.externals import joblib\n",
    "from functools import partial\n",
    "\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "import category_encoders as ce"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DIR = '/PATH/TO/YOUR/DATA'\n",
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
    "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n",
    "previous_application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/previous_application.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Solution 6\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR': application['SK_ID_CURR']})\n",
    "\n",
    "common_columns = [col for col in application.columns if col in previous_application.columns]\n",
    "application_common = application[common_columns]\n",
    "merged_tables = previous_application[common_columns + ['DAYS_DECISION']].merge(application_common, on='SK_ID_CURR',\n",
    "                                                                               how='right')\n",
    "merged_tables.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_sorted = merged_tables.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "merged_sorted['annuity_diff'] = merged_sorted['AMT_ANNUITY_y'] - merged_sorted['AMT_ANNUITY_x']\n",
    "merged_sorted['annuity_ratio'] = merged_sorted['AMT_ANNUITY_y'] / merged_sorted['AMT_ANNUITY_x']\n",
    "merged_sorted['credit_diff'] = merged_sorted['AMT_CREDIT_y'] - merged_sorted['AMT_CREDIT_x']\n",
    "merged_sorted['credit_ratio'] = merged_sorted['AMT_CREDIT_y'] / merged_sorted['AMT_CREDIT_x']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "merged_sorted['the_same_contract_type'] = (\n",
    "    merged_sorted['NAME_CONTRACT_TYPE_x'] == merged_sorted['NAME_CONTRACT_TYPE_y']).astype(int)\n",
    "merged_sorted['the_same_weekday'] = (merged_sorted['WEEKDAY_APPR_PROCESS_START_x'] == merged_sorted['WEEKDAY_APPR_PROCESS_START_y']).astype(int)\n",
    "merged_sorted['hour_diff'] = merged_sorted['HOUR_APPR_PROCESS_START_x'] - merged_sorted['HOUR_APPR_PROCESS_START_y']\n",
    "merged_sorted['the_same_type_suite'] = (merged_sorted['NAME_TYPE_SUITE_x'] == merged_sorted['NAME_TYPE_SUITE_y']\n",
    "                                       ).astype(int)\n",
    "merged_sorted['the_same_type_suite'][merged_sorted['NAME_TYPE_SUITE_x'].isnull()] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_last_k_applications_feature_name(feature_name, number, suffix):\n",
    "    return 'application_previous_application_{}_last_{}_applications_{}'.format(feature_name, number, suffix)\n",
    "\n",
    "\n",
    "def get_last_k_credits_features(merged_sorted, numbers_of_applications):\n",
    "    features = pd.DataFrame({'SK_ID_CURR': merged_sorted['SK_ID_CURR'].unique()})\n",
    "    feature_list = ['annuity_diff', 'annuity_ratio', 'credit_diff', 'credit_ratio', 'the_same_contract_type',\n",
    "                        'the_same_type_suite', 'the_same_weekday', 'hour_diff']\n",
    "\n",
    "    for number in numbers_of_applications:\n",
    "        table_tail = merged_sorted.groupby('SK_ID_CURR').tail(number)\n",
    "        tail_groupby = table_tail.groupby('SK_ID_CURR')\n",
    "        g = tail_groupby[feature_list].agg('mean')\n",
    "\n",
    "        g = g.rename(axis='columns', mapper=partial(_get_last_k_applications_feature_name, number=number,\n",
    "                                        suffix='mean')).reset_index()\n",
    "\n",
    "        features = features.merge(g, how='left', on=['SK_ID_CURR'])\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = get_last_k_credits_features(merged_sorted, numbers_of_applications=[1,3,5,10])\n",
    "features = features.merge(g, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features,\n",
    "                                left_on=['SK_ID_CURR'],\n",
    "                                right_on=['SK_ID_CURR'],\n",
    "                                how='left',\n",
    "                                validate='one_to_one')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = list(features.columns)\n",
    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
    "X = X[engineered_numerical_columns + ['TARGET']]\n",
    "X_corr = abs(X.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
